-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[Clang][X86] Replace F16C vcvtph2ps/256
intrinsics with (convert|shuffle)vector
builtins
#152911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Clang][X86] Replace F16C vcvtph2ps/256
intrinsics with (convert|shuffle)vector
builtins
#152911
Conversation
…onvertvector The following intrinsics were replaced by a combination of `__builtin_shufflevector` and `__builtin_convertvector`: - `__builtin_ia32_vcvtph2ps` - `__builtin_ia32_vcvtph2ps256` Fixes llvm#152749
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang Author: None (moorabbit) ChangesThe following intrinsics were replaced by a combination of
Fixes #152749 Full diff: https://github.com/llvm/llvm-project/pull/152911.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 3efc0be8fa698..fc1ee3be7889f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -757,14 +757,6 @@ let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
}
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
-}
-
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
-}
-
let Features = "rdrnd", Attributes = [NoThrow] in {
def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b508709e4bbae..f8d451bd20fa3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2841,8 +2841,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
// f16c half2float intrinsics
- case X86::BI__builtin_ia32_vcvtph2ps:
- case X86::BI__builtin_ia32_vcvtph2ps256:
case X86::BI__builtin_ia32_vcvtph2ps_mask:
case X86::BI__builtin_ia32_vcvtph2ps256_mask:
case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 60d2000dfb809..9b5567396f60b 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -40,12 +40,17 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));
#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
+typedef _Float16 __v4hf __attribute__((__vector_size__(8)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+#else
+/* Use __fp16 when _Float16 is not supported. */
+typedef __fp16 __v4hf __attribute__((__vector_size__(8)));
+typedef __fp16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
#endif
/* Define the default attributes for the functions in this file. */
diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h
index 94a662c1d93a8..98b25f5a56953 100644
--- a/clang/lib/Headers/f16cintrin.h
+++ b/clang/lib/Headers/f16cintrin.h
@@ -39,7 +39,8 @@ static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
- __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
+ __v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3);
+ __v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf);
return __r[0];
}
@@ -109,7 +110,8 @@ _cvtsh_ss(unsigned short __a)
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
- return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+ __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);
+ return __builtin_convertvector((__v4hf)__v, __v4sf);
}
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
@@ -153,7 +155,7 @@ _mm_cvtph_ps(__m128i __a)
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
- return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+ return __builtin_convertvector((__v8hf)__a, __v8sf);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/f16c-builtins-constrained.c b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
index bbd4d8f83b53a..5716f09a677f5 100644
--- a/clang/test/CodeGen/X86/f16c-builtins-constrained.c
+++ b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
@@ -38,7 +38,7 @@ unsigned short test_cvtss_sh(float a) {
__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call {{.*}}<4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
return _mm_cvtph_ps(a);
}
diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
index 3c6d64c225b32..61ffa24e6f5c7 100644
--- a/clang/test/CodeGen/X86/f16c-builtins.c
+++ b/clang/test/CodeGen/X86/f16c-builtins.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
@@ -35,7 +35,7 @@ unsigned short test_cvtss_sh(float a) {
__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
return _mm_cvtph_ps(a);
}
|
@llvm/pr-subscribers-clang-codegen Author: None (moorabbit) ChangesThe following intrinsics were replaced by a combination of
Fixes #152749 Full diff: https://github.com/llvm/llvm-project/pull/152911.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 3efc0be8fa698..fc1ee3be7889f 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -757,14 +757,6 @@ let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
}
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
-}
-
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
-}
-
let Features = "rdrnd", Attributes = [NoThrow] in {
def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b508709e4bbae..f8d451bd20fa3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2841,8 +2841,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
// f16c half2float intrinsics
- case X86::BI__builtin_ia32_vcvtph2ps:
- case X86::BI__builtin_ia32_vcvtph2ps256:
case X86::BI__builtin_ia32_vcvtph2ps_mask:
case X86::BI__builtin_ia32_vcvtph2ps256_mask:
case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 60d2000dfb809..9b5567396f60b 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -40,12 +40,17 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));
#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
+typedef _Float16 __v4hf __attribute__((__vector_size__(8)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+#else
+/* Use __fp16 when _Float16 is not supported. */
+typedef __fp16 __v4hf __attribute__((__vector_size__(8)));
+typedef __fp16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
#endif
/* Define the default attributes for the functions in this file. */
diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h
index 94a662c1d93a8..98b25f5a56953 100644
--- a/clang/lib/Headers/f16cintrin.h
+++ b/clang/lib/Headers/f16cintrin.h
@@ -39,7 +39,8 @@ static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
- __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
+ __v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3);
+ __v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf);
return __r[0];
}
@@ -109,7 +110,8 @@ _cvtsh_ss(unsigned short __a)
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
- return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+ __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);
+ return __builtin_convertvector((__v4hf)__v, __v4sf);
}
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
@@ -153,7 +155,7 @@ _mm_cvtph_ps(__m128i __a)
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
- return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+ return __builtin_convertvector((__v8hf)__a, __v8sf);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/f16c-builtins-constrained.c b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
index bbd4d8f83b53a..5716f09a677f5 100644
--- a/clang/test/CodeGen/X86/f16c-builtins-constrained.c
+++ b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
@@ -38,7 +38,7 @@ unsigned short test_cvtss_sh(float a) {
__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call {{.*}}<4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
return _mm_cvtph_ps(a);
}
diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
index 3c6d64c225b32..61ffa24e6f5c7 100644
--- a/clang/test/CodeGen/X86/f16c-builtins.c
+++ b/clang/test/CodeGen/X86/f16c-builtins.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
@@ -35,7 +35,7 @@ unsigned short test_cvtss_sh(float a) {
__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
- // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
return _mm_cvtph_ps(a);
}
|
I couldn't get __builtin_convertvector to emit an fpext instruction instead of an sitofp when converting a 16-bit half-precision float into a 32-bit float. I tried to solve this by defining __v4hf and __v8hf types (vectors of 4 and 8 half-precision floats) and casting the operand of __builtin_convertvector to one of them, so that Clang recognizes the vector elements as floating types rather than integers. |
clang/lib/Headers/f16cintrin.h
Outdated
@@ -39,7 +39,8 @@ static __inline float __DEFAULT_FN_ATTRS128 | |||
_cvtsh_ss(unsigned short __a) | |||
{ | |||
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; | |||
__v4sf __r = __builtin_ia32_vcvtph2ps(__v); | |||
__v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3); | |||
__v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf); | |||
return __r[0]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this work consistently? I haven't properly compared the final asm at different -O levels.
float _cvtsh_ss(unsigned short __a)
{
return (float)__builtin_bit_cast(_Float16, __a);
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thx, LGTM: https://godbolt.org/z/Pcr7aYeKE.
I wasn't aware of the builtin.
clang/lib/Headers/emmintrin.h
Outdated
#else | ||
/* Use __fp16 when _Float16 is not supported. */ | ||
typedef __fp16 __v4hf __attribute__((__vector_size__(8))); | ||
typedef __fp16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand why this is necessary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some tests were failing due to _Float16
not being supported on i686-*
.
Maybe it's better to just add the -target-feature +sse2
flag in the failing tests to force support of _Float16
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We intend request SSE2 for _Float16
type.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would we be better off just defining the types inside the intrinsics?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That sounds better. Thx!
I'm also using |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
one final minor
clang/lib/Headers/f16cintrin.h
Outdated
typedef __fp16 __v4fp16 __attribute__((__vector_size__(8))); | ||
|
||
__v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3); | ||
return __builtin_convertvector((__v4fp16)__v, __v4sf); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return (__m128)__builtin_convertvector((__v4fp16)__v, __v4sf);
clang/lib/Headers/f16cintrin.h
Outdated
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a); | ||
typedef __fp16 __v8fp16 __attribute__((__vector_size__(16), __aligned__(16))); | ||
|
||
return __builtin_convertvector((__v8fp16)__a, __v8sf); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return (__m256)__builtin_convertvector((__v8fp16)__a, __v8sf);
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
Thx for the review! FYI, I don't have commit access yet, so I can't merge my PR. Someone needs to do it on my behalf. |
The following intrinsics were replaced by a combination of
__builtin_shufflevector
and__builtin_convertvector
:__builtin_ia32_vcvtph2ps
__builtin_ia32_vcvtph2ps256
Fixes #152749